library(mfp)
## Loading required package: survival
library(lattice)
library(AUC)
## AUC 0.3.0
## Type AUCNews() to see the change log and ?AUC to get an overview.
library(plyr)
library(MASS)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
library(mvtnorm)
library(car)
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'caret'
## The following objects are masked from 'package:AUC':
##
## sensitivity, specificity
## The following object is masked from 'package:survival':
##
## cluster
Попарные диаграммы рассеяния всех количественных признаков:
wine_data = read.csv('wine.csv')
colnames(wine_data) = c("X","type","constant_acidity","acetic_acidity","critic_acid_amount","residual_sugar","chlorides","free_sulfur_dioxide","total_sulfur_dioxide","density","pH","sulphates","alcohol","grade")
wine_data$X = NULL
wine_data$type = as.integer(wine_data$type == "красное")
panel.hist <- function(x, ...){
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks; nB <- length(breaks)
y <- h$counts; y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col = "red", ...)
}
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * r)
}
mycol <- rgb(30,30,30,100,maxColorValue=255)
panel.dots <- function(x, y, ...){
points(x, y, pch=19, col=mycol)
}
wine_data$below_grade = as.integer(wine_data$grade < 6)
wine_data$above_grade = as.integer(wine_data$grade > 6)
wine_data$grade = NULL
pairs(wine_data[,c("constant_acidity","acetic_acidity","critic_acid_amount","residual_sugar","chlorides","free_sulfur_dioxide","total_sulfur_dioxide","density","pH","sulphates","alcohol")], diag.panel=panel.hist,upper.panel = panel.cor, lower.panel = panel.dots)
Видна корелляция между содержанием алкоголя и плотностью.
Посмотрим на распределение занчений целевой переменной.
#par(mfrow=c(1,2))
hist(wine_data$alcohol, col="red", main="", xlab="alcohol", breaks=50)
Видно что оно не очень нормальное – надо примениить преобразование отклика методом Бокса-Кокса. Кроме того, сразу видим, что имеется выброс с максимальным значением по признаку алкогольности, выбросим его сразу перед применением преобразования.
После этих действий, снова построим распределение преобразованной алкогольности.
wine_data = subset(wine_data, wine_data$alcohol != 14.9) # удаляем как выброс
wine_data$alcohol = predict(BoxCoxTrans(wine_data$alcohol), wine_data$alcohol)
hist(wine_data$alcohol, col="red", main="", xlab="alcohol", breaks=50)
После применения преобразования распределение стало больше похоже на нормальное.
Теперь построим линейную модель по всем признакам:
#Создание модели
m1 = lm(alcohol ~., data=wine_data)
##Значимость признаков
get_multi_hyotesys_corrected_p_value = function(m, EstType){
beta = coef(m)[-1]
Vbeta = vcovHC(m, type = EstType)[-1,-1]
D = diag(1 / sqrt(diag(Vbeta)))
t = D %*% beta
Cor = D %*% Vbeta %*% t(D)
m.df = length(m$residuals) - length(beta)
p_adj = sapply(abs(t), function(x) 1-pmvt(-rep(x, length(beta)), rep(x, length(beta)), corr = Cor, df = m.df))
c(NaN, p_adj)
}
s1 <-summary(m1)
s1$coefficients <- cbind(s1$coefficients, get_multi_hyotesys_corrected_p_value(m1, "HC0"))
dimnames(s1$coefficients)[[2]][5] <- "Adjusted p-value"
s1
##
## Call:
## lm(formula = alcohol ~ ., data = wine_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0045807 -0.0003954 -0.0000098 0.0003744 0.0206682
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.373e+00 7.421e-03 1.851e+02 0.000e+00
## type 1.423e-03 5.016e-05 2.838e+01 4.350e-167
## constant_acidity 7.166e-04 1.212e-05 5.912e+01 0.000e+00
## acetic_acidity 8.289e-04 7.828e-05 1.059e+01 5.476e-26
## critic_acid_amount 4.996e-04 7.528e-05 6.637e+00 3.471e-11
## residual_sugar 2.812e-04 4.097e-06 6.863e+01 0.000e+00
## chlorides -1.582e-03 3.178e-04 -4.977e+00 6.634e-07
## free_sulfur_dioxide -4.330e-06 7.286e-07 -5.943e+00 2.952e-09
## total_sulfur_dioxide -4.067e-07 3.088e-07 -1.317e+00 1.879e-01
## density -8.503e-01 7.638e-03 -1.113e+02 0.000e+00
## pH 3.619e-03 7.387e-05 4.900e+01 0.000e+00
## sulphates 1.341e-03 7.111e-05 1.886e+01 2.642e-77
## below_grade -2.612e-04 2.102e-05 -1.243e+01 4.614e-35
## above_grade 2.559e-05 2.476e-05 1.034e+00 3.014e-01
## Adjusted p-value
## (Intercept) NA
## type 0.000
## constant_acidity 0.000
## acetic_acidity 0.003
## critic_acid_amount 0.003
## residual_sugar 0.000
## chlorides 0.023
## free_sulfur_dioxide 0.000
## total_sulfur_dioxide 0.998
## density 0.000
## pH 0.000
## sulphates 0.000
## below_grade 0.000
## above_grade 0.987
##
## Residual standard error: 0.0006971 on 6482 degrees of freedom
## Multiple R-squared: 0.8117, Adjusted R-squared: 0.8113
## F-statistic: 2149 on 13 and 6482 DF, p-value: < 2.2e-16
Для отсева выбросов строим график расстояния Кука.
##Расстояние кука
plot(fitted(m1), cooks.distance(m1), xlab="Fitted alcohol", ylab="Cook's distance")
lines(c(0,100), c(0.015, 0.015), col="red")
plot(wine_data$alcohol, cooks.distance(m1), xlab="alcohol", ylab="Cook's distance")
lines(c(0,100), c(0.015, 0.015), col="red")
Видим выбросы и пытаемся отфильтровать их по расстоянию кука, построив после этого снова графиик расстояния Кука для новой модели.
## [1] 4469
Теперь можно считать, что выбросов нет.
Построим значения трех критериев для остатков модели.
| Критерий | p |
|---|---|
| Шапиро-Уилка | 5.052935410^{-15} |
| Уилкоксона | 0.6866825 |
| Бройша-Пагана | 5.531778610^{-28} |
Остатки ненормальны и гетероскедастичны, поэтому оценку значимости признаков будем делать с дисперсиями Уайта. Также будем делать поправку на множественность.
Посмотрим на графики остатков и попытаемся найти квадратичные зависимости:
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.0022e-28
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
Видим, что у признаков residual_sugar и chlorides тренды зависимостей похожи на квадратичные, поэтому добавим это в новую модель.
m2 <- lm(alcohol ~ . + I(residual_sugar^2) + I(chlorides^2) , data=wine_data)
Произведем сравнение моделей по критерию Вальда с дисперсиями Уайта:
#Сравнение моделей
waldtest(m2, m1, vcov = vcovHC(m2, type = "HC0"))
## Wald test
##
## Model 1: alcohol ~ type + constant_acidity + acetic_acidity + critic_acid_amount +
## residual_sugar + chlorides + free_sulfur_dioxide + total_sulfur_dioxide +
## density + pH + sulphates + below_grade + above_grade + I(residual_sugar^2) +
## I(chlorides^2)
## Model 2: alcohol ~ type + constant_acidity + acetic_acidity + critic_acid_amount +
## residual_sugar + chlorides + free_sulfur_dioxide + total_sulfur_dioxide +
## density + pH + sulphates + below_grade + above_grade
## Res.Df Df F Pr(>F)
## 1 4453
## 2 4455 -2 16.483 7.38e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Модель получилась значимо лучше.
Вот графики ошибок новой модели:
##Графики ошибок
plot(1:dim(wine_data)[1], rstandard(m2), xlab="i", ylab="Standardized residuals", col=mycol, pch=19)
addtrend(1:dim(wine_data)[1], rstandard(m2))
grid()
plot(fitted(m2), rstandard(m2), xlab="Fitted values", ylab="Standardized residuals", col=mycol, pch=19)
addtrend(fitted(m2), rstandard(m2))
grid()
for (col_name in colnames(wine_data)) {
if (col_name != "alcohol") {
plot(wine_data[,col_name], rstandard(m2), xlab=col_name, ylab="Standardized residuals", col=mycol, pch=19)
addtrend(wine_data[,col_name], rstandard(m2))
grid()
}
}
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.0022e-28
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
Посмотрим на значимость признаков полученной модели, не забывая про поправку на множественность:
##Значимость признаков второй модели
s1 =summary(m2)
s1$coefficients = cbind(s1$coefficients, get_multi_hyotesys_corrected_p_value(m2, "HC0"))
dimnames(s1$coefficients)[[2]][5] = "Adjusted p-value"
s1
##
## Call:
## lm(formula = alcohol ~ . + I(residual_sugar^2) + I(chlorides^2),
## data = wine_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.902e-04 -2.335e-04 -9.520e-06 2.339e-04 9.044e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.432e+00 4.563e-03 3.138e+02 0.000e+00
## type 1.660e-03 3.169e-05 5.239e+01 0.000e+00
## constant_acidity 7.420e-04 7.386e-06 1.005e+02 0.000e+00
## acetic_acidity 5.316e-04 4.875e-05 1.090e+01 2.413e-27
## critic_acid_amount 3.661e-04 4.539e-05 8.066e+00 9.293e-16
## residual_sugar 3.162e-04 4.343e-06 7.281e+01 0.000e+00
## chlorides 3.660e-04 5.033e-04 7.271e-01 4.672e-01
## free_sulfur_dioxide -4.186e-06 4.277e-07 -9.787e+00 2.157e-22
## total_sulfur_dioxide 9.427e-08 1.833e-07 5.144e-01 6.070e-01
## density -9.092e-01 4.692e-03 -1.938e+02 0.000e+00
## pH 3.531e-03 4.291e-05 8.228e+01 0.000e+00
## sulphates 1.332e-03 4.356e-05 3.058e+01 1.283e-186
## below_grade -2.054e-04 1.201e-05 -1.709e+01 1.723e-63
## above_grade -3.599e-05 1.321e-05 -2.724e+00 6.465e-03
## I(residual_sugar^2) -1.021e-06 2.273e-07 -4.490e+00 7.287e-06
## I(chlorides^2) -3.308e-03 1.520e-03 -2.177e+00 2.953e-02
## Adjusted p-value
## (Intercept) NA
## type 0.000
## constant_acidity 0.000
## acetic_acidity 0.000
## critic_acid_amount 0.000
## residual_sugar 0.000
## chlorides 0.996
## free_sulfur_dioxide 0.000
## total_sulfur_dioxide 1.000
## density 0.000
## pH 0.000
## sulphates 0.000
## below_grade 0.000
## above_grade 0.040
## I(residual_sugar^2) 0.000
## I(chlorides^2) 0.009
##
## Residual standard error: 0.0003165 on 4453 degrees of freedom
## Multiple R-squared: 0.9579, Adjusted R-squared: 0.9578
## F-statistic: 6755 on 15 and 4453 DF, p-value: < 2.2e-16
add1(m2, ~ .^2, test="F")
## Single term additions
##
## Model:
## alcohol ~ type + constant_acidity + acetic_acidity + critic_acid_amount +
## residual_sugar + chlorides + free_sulfur_dioxide + total_sulfur_dioxide +
## density + pH + sulphates + below_grade + above_grade + I(residual_sugar^2) +
## I(chlorides^2)
## Df Sum of Sq RSS AIC
## <none> 0.00044609 -72008
## type:constant_acidity 1 9.0000e-10 0.00044609 -72006
## type:acetic_acidity 1 5.2000e-08 0.00044604 -72006
## type:critic_acid_amount 1 6.4400e-08 0.00044602 -72007
## type:residual_sugar 1 6.1300e-08 0.00044603 -72007
## type:chlorides 1 2.4500e-08 0.00044606 -72006
## type:free_sulfur_dioxide 1 3.5680e-07 0.00044573 -72009
## type:total_sulfur_dioxide 1 1.8643e-06 0.00044422 -72025
## type:density 1 6.0290e-07 0.00044549 -72012
## type:pH 1 2.1271e-06 0.00044396 -72027
## type:sulphates 1 2.6703e-06 0.00044342 -72033
## type:below_grade 1 6.5050e-07 0.00044544 -72012
## type:above_grade 1 1.1543e-06 0.00044493 -72017
## type:I(residual_sugar^2) 1 4.6600e-08 0.00044604 -72006
## type:I(chlorides^2) 1 2.8200e-08 0.00044606 -72006
## constant_acidity:acetic_acidity 1 4.1000e-09 0.00044608 -72006
## constant_acidity:critic_acid_amount 1 5.2030e-07 0.00044557 -72011
## constant_acidity:residual_sugar 1 1.4200e-08 0.00044607 -72006
## constant_acidity:chlorides 1 0.0000e+00 0.00044609 -72006
## constant_acidity:free_sulfur_dioxide 1 2.6450e-07 0.00044582 -72009
## constant_acidity:total_sulfur_dioxide 1 2.8880e-07 0.00044580 -72009
## constant_acidity:density 1 4.5200e-08 0.00044604 -72006
## constant_acidity:pH 1 3.5010e-07 0.00044574 -72009
## constant_acidity:sulphates 1 9.7960e-07 0.00044511 -72016
## constant_acidity:below_grade 1 8.2330e-07 0.00044526 -72014
## constant_acidity:above_grade 1 1.0874e-06 0.00044500 -72017
## constant_acidity:I(residual_sugar^2) 1 7.8000e-09 0.00044608 -72006
## constant_acidity:I(chlorides^2) 1 1.0700e-08 0.00044608 -72006
## acetic_acidity:critic_acid_amount 1 4.6120e-07 0.00044563 -72011
## acetic_acidity:residual_sugar 1 3.1000e-09 0.00044608 -72006
## acetic_acidity:chlorides 1 6.6000e-08 0.00044602 -72007
## acetic_acidity:free_sulfur_dioxide 1 2.6960e-07 0.00044582 -72009
## acetic_acidity:total_sulfur_dioxide 1 1.4815e-06 0.00044461 -72021
## acetic_acidity:density 1 5.1800e-07 0.00044557 -72011
## acetic_acidity:pH 1 9.9970e-07 0.00044509 -72016
## acetic_acidity:sulphates 1 7.3010e-07 0.00044536 -72013
## acetic_acidity:below_grade 1 1.3530e-07 0.00044595 -72007
## acetic_acidity:above_grade 1 9.5900e-08 0.00044599 -72007
## acetic_acidity:I(residual_sugar^2) 1 2.1210e-07 0.00044588 -72008
## acetic_acidity:I(chlorides^2) 1 1.4100e-08 0.00044607 -72006
## critic_acid_amount:residual_sugar 1 1.9400e-08 0.00044607 -72006
## critic_acid_amount:chlorides 1 2.5000e-09 0.00044609 -72006
## critic_acid_amount:free_sulfur_dioxide 1 7.6400e-08 0.00044601 -72007
## critic_acid_amount:total_sulfur_dioxide 1 1.2800e-08 0.00044608 -72006
## critic_acid_amount:density 1 2.2000e-09 0.00044609 -72006
## critic_acid_amount:pH 1 1.3840e-07 0.00044595 -72007
## critic_acid_amount:sulphates 1 4.1800e-08 0.00044605 -72006
## critic_acid_amount:below_grade 1 2.0570e-07 0.00044588 -72008
## critic_acid_amount:above_grade 1 2.0840e-07 0.00044588 -72008
## critic_acid_amount:I(residual_sugar^2) 1 7.7000e-09 0.00044608 -72006
## critic_acid_amount:I(chlorides^2) 1 0.0000e+00 0.00044609 -72006
## residual_sugar:chlorides 1 3.0080e-07 0.00044579 -72009
## residual_sugar:free_sulfur_dioxide 1 2.0170e-07 0.00044589 -72008
## residual_sugar:total_sulfur_dioxide 1 8.8880e-07 0.00044520 -72015
## residual_sugar:density 1 2.5604e-06 0.00044353 -72032
## residual_sugar:pH 1 4.5410e-07 0.00044563 -72010
## residual_sugar:sulphates 1 2.1600e-08 0.00044607 -72006
## residual_sugar:below_grade 1 2.2270e-07 0.00044587 -72008
## residual_sugar:above_grade 1 6.5870e-07 0.00044543 -72013
## residual_sugar:I(residual_sugar^2) 1 1.1980e-07 0.00044597 -72007
## residual_sugar:I(chlorides^2) 1 7.0900e-08 0.00044602 -72007
## chlorides:free_sulfur_dioxide 1 3.1020e-07 0.00044578 -72009
## chlorides:total_sulfur_dioxide 1 8.4930e-07 0.00044524 -72014
## chlorides:density 1 0.0000e+00 0.00044609 -72006
## chlorides:pH 1 3.4618e-06 0.00044263 -72041
## chlorides:sulphates 1 1.3622e-06 0.00044473 -72020
## chlorides:below_grade 1 9.1760e-07 0.00044517 -72015
## chlorides:above_grade 1 1.2676e-06 0.00044482 -72019
## chlorides:I(residual_sugar^2) 1 4.8090e-07 0.00044561 -72011
## chlorides:I(chlorides^2) 1 2.1400e-08 0.00044607 -72006
## free_sulfur_dioxide:total_sulfur_dioxide 1 3.4100e-08 0.00044605 -72006
## free_sulfur_dioxide:density 1 6.8800e-07 0.00044540 -72013
## free_sulfur_dioxide:pH 1 4.2410e-07 0.00044566 -72010
## free_sulfur_dioxide:sulphates 1 9.9770e-07 0.00044509 -72016
## free_sulfur_dioxide:below_grade 1 9.5000e-09 0.00044608 -72006
## free_sulfur_dioxide:above_grade 1 2.6000e-09 0.00044609 -72006
## free_sulfur_dioxide:I(residual_sugar^2) 1 2.3290e-07 0.00044586 -72008
## free_sulfur_dioxide:I(chlorides^2) 1 1.0420e-07 0.00044598 -72007
## total_sulfur_dioxide:density 1 5.3754e-06 0.00044071 -72060
## total_sulfur_dioxide:pH 1 1.1000e-09 0.00044609 -72006
## total_sulfur_dioxide:sulphates 1 8.2600e-07 0.00044526 -72014
## total_sulfur_dioxide:below_grade 1 1.6900e-08 0.00044607 -72006
## total_sulfur_dioxide:above_grade 1 1.1300e-08 0.00044608 -72006
## total_sulfur_dioxide:I(residual_sugar^2) 1 3.5260e-07 0.00044574 -72009
## total_sulfur_dioxide:I(chlorides^2) 1 1.6630e-07 0.00044592 -72008
## density:pH 1 7.2215e-06 0.00043887 -72079
## density:sulphates 1 3.5252e-06 0.00044256 -72041
## density:below_grade 1 1.3545e-06 0.00044473 -72019
## density:above_grade 1 2.1661e-06 0.00044392 -72028
## density:I(residual_sugar^2) 1 1.2864e-06 0.00044480 -72019
## density:I(chlorides^2) 1 9.5800e-08 0.00044599 -72007
## pH:sulphates 1 5.5670e-07 0.00044553 -72011
## pH:below_grade 1 1.5486e-06 0.00044454 -72021
## pH:above_grade 1 2.3691e-06 0.00044372 -72030
## pH:I(residual_sugar^2) 1 4.9210e-07 0.00044560 -72011
## pH:I(chlorides^2) 1 1.0284e-06 0.00044506 -72016
## sulphates:below_grade 1 2.9220e-07 0.00044580 -72009
## sulphates:above_grade 1 7.7900e-08 0.00044601 -72007
## sulphates:I(residual_sugar^2) 1 4.7500e-08 0.00044604 -72006
## sulphates:I(chlorides^2) 1 2.0970e-07 0.00044588 -72008
## below_grade:above_grade 0 0.0000e+00 0.00044609 -72008
## below_grade:I(residual_sugar^2) 1 9.2000e-09 0.00044608 -72006
## below_grade:I(chlorides^2) 1 1.9040e-07 0.00044590 -72008
## above_grade:I(residual_sugar^2) 1 2.7230e-07 0.00044582 -72009
## above_grade:I(chlorides^2) 1 1.6370e-07 0.00044592 -72008
## I(residual_sugar^2):I(chlorides^2) 1 1.3330e-07 0.00044595 -72007
## F value Pr(>F)
## <none>
## type:constant_acidity 0.0094 0.9226003
## type:acetic_acidity 0.5194 0.4711304
## type:critic_acid_amount 0.6426 0.4228245
## type:residual_sugar 0.6118 0.4341398
## type:chlorides 0.2442 0.6212160
## type:free_sulfur_dioxide 3.5640 0.0591111 .
## type:total_sulfur_dioxide 18.6842 1.576e-05 ***
## type:density 6.0251 0.0141415 *
## type:pH 21.3301 3.975e-06 ***
## type:sulphates 26.8106 2.343e-07 ***
## type:below_grade 6.5013 0.0108128 *
## type:above_grade 11.5494 0.0006836 ***
## type:I(residual_sugar^2) 0.4652 0.4952471
## type:I(chlorides^2) 0.2819 0.5955113
## constant_acidity:acetic_acidity 0.0408 0.8400131
## constant_acidity:critic_acid_amount 5.1982 0.0226571 *
## constant_acidity:residual_sugar 0.1420 0.7062802
## constant_acidity:chlorides 0.0002 0.9891574
## constant_acidity:free_sulfur_dioxide 2.6412 0.1041974
## constant_acidity:total_sulfur_dioxide 2.8845 0.0895055 .
## constant_acidity:density 0.4513 0.5017584
## constant_acidity:pH 3.4972 0.0615399 .
## constant_acidity:sulphates 9.7982 0.0017581 **
## constant_acidity:below_grade 8.2314 0.0041366 **
## constant_acidity:above_grade 10.8792 0.0009801 ***
## constant_acidity:I(residual_sugar^2) 0.0780 0.7800876
## constant_acidity:I(chlorides^2) 0.1072 0.7434242
## acetic_acidity:critic_acid_amount 4.6074 0.0318879 *
## acetic_acidity:residual_sugar 0.0305 0.8613962
## acetic_acidity:chlorides 0.6591 0.4169188
## acetic_acidity:free_sulfur_dioxide 2.6923 0.1009055
## acetic_acidity:total_sulfur_dioxide 14.8344 0.0001190 ***
## acetic_acidity:density 5.1762 0.0229458 *
## acetic_acidity:pH 9.9994 0.0015765 **
## acetic_acidity:sulphates 7.2989 0.0069258 **
## acetic_acidity:below_grade 1.3506 0.2452359
## acetic_acidity:above_grade 0.9576 0.3278464
## acetic_acidity:I(residual_sugar^2) 2.1182 0.1456274
## acetic_acidity:I(chlorides^2) 0.1408 0.7074931
## critic_acid_amount:residual_sugar 0.1941 0.6595702
## critic_acid_amount:chlorides 0.0253 0.8735654
## critic_acid_amount:free_sulfur_dioxide 0.7623 0.3826677
## critic_acid_amount:total_sulfur_dioxide 0.1277 0.7208939
## critic_acid_amount:density 0.0219 0.8823505
## critic_acid_amount:pH 1.3819 0.2398437
## critic_acid_amount:sulphates 0.4172 0.5183860
## critic_acid_amount:below_grade 2.0543 0.1518467
## critic_acid_amount:above_grade 2.0813 0.1491818
## critic_acid_amount:I(residual_sugar^2) 0.0769 0.7815563
## critic_acid_amount:I(chlorides^2) 0.0002 0.9894453
## residual_sugar:chlorides 3.0036 0.0831487 .
## residual_sugar:free_sulfur_dioxide 2.0139 0.1559334
## residual_sugar:total_sulfur_dioxide 8.8883 0.0028855 **
## residual_sugar:density 25.7005 4.149e-07 ***
## residual_sugar:pH 4.5364 0.0332354 *
## residual_sugar:sulphates 0.2156 0.6424614
## residual_sugar:below_grade 2.2239 0.1359596
## residual_sugar:above_grade 6.5836 0.0103246 *
## residual_sugar:I(residual_sugar^2) 1.1963 0.2741307
## residual_sugar:I(chlorides^2) 0.7074 0.4003608
## chlorides:free_sulfur_dioxide 3.0980 0.0784563 .
## chlorides:total_sulfur_dioxide 8.4920 0.0035848 **
## chlorides:density 0.0004 0.9841326
## chlorides:pH 34.8189 3.887e-09 ***
## chlorides:sulphates 13.6366 0.0002245 ***
## chlorides:below_grade 9.1763 0.0024658 **
## chlorides:above_grade 12.6866 0.0003721 ***
## chlorides:I(residual_sugar^2) 4.8043 0.0284399 *
## chlorides:I(chlorides^2) 0.2132 0.6443137
## free_sulfur_dioxide:total_sulfur_dioxide 0.3407 0.5594622
## free_sulfur_dioxide:density 6.8771 0.0087603 **
## free_sulfur_dioxide:pH 4.2368 0.0396136 *
## free_sulfur_dioxide:sulphates 9.9791 0.0015939 **
## free_sulfur_dioxide:below_grade 0.0947 0.7583384
## free_sulfur_dioxide:above_grade 0.0263 0.8712493
## free_sulfur_dioxide:I(residual_sugar^2) 2.3255 0.1273414
## free_sulfur_dioxide:I(chlorides^2) 1.0398 0.3079162
## total_sulfur_dioxide:density 54.3010 2.039e-13 ***
## total_sulfur_dioxide:pH 0.0113 0.9153002
## total_sulfur_dioxide:sulphates 8.2592 0.0040739 **
## total_sulfur_dioxide:below_grade 0.1684 0.6815643
## total_sulfur_dioxide:above_grade 0.1124 0.7374008
## total_sulfur_dioxide:I(residual_sugar^2) 3.5218 0.0606307 .
## total_sulfur_dioxide:I(chlorides^2) 1.6608 0.1975617
## density:pH 73.2575 < 2.2e-16 ***
## density:sulphates 35.4621 2.801e-09 ***
## density:below_grade 13.5594 0.0002339 ***
## density:above_grade 21.7235 3.241e-06 ***
## density:I(residual_sugar^2) 12.8757 0.0003365 ***
## density:I(chlorides^2) 0.9559 0.3282636
## pH:sulphates 5.5627 0.0183901 *
## pH:below_grade 15.5090 8.337e-05 ***
## pH:above_grade 23.7699 1.124e-06 ***
## pH:I(residual_sugar^2) 4.9167 0.0266479 *
## pH:I(chlorides^2) 10.2874 0.0013489 **
## sulphates:below_grade 2.9181 0.0876604 .
## sulphates:above_grade 0.7774 0.3779811
## sulphates:I(residual_sugar^2) 0.4737 0.4913229
## sulphates:I(chlorides^2) 2.0942 0.1479264
## below_grade:above_grade
## below_grade:I(residual_sugar^2) 0.0922 0.7614104
## below_grade:I(chlorides^2) 1.9013 0.1680038
## above_grade:I(residual_sugar^2) 2.7197 0.0991848 .
## above_grade:I(chlorides^2) 1.6348 0.2011104
## I(residual_sugar^2):I(chlorides^2) 1.3309 0.2487134
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Делаем вывод, что лучше удалить признаки above_grade,chlorides,total_sulfur_dioxide, но они могут оказаться хорошими в парах с некоторыми другими признаками. Из вывода выше, подберем самые лучшие пары и внесем их в модель: пары density:above_grade,pH:above_grade,chlorides:pH,type:total_sulfur_dioxide,total_sulfur_dioxide:density
m3 = lm(alcohol ~ type + constant_acidity + acetic_acidity + critic_acid_amount + residual_sugar + free_sulfur_dioxide + density + pH + sulphates + below_grade + I(residual_sugar^2) + I(chlorides^2) + density*above_grade + pH*above_grade + chlorides*pH + type*total_sulfur_dioxide + total_sulfur_dioxide*density, data=wine_data)
Снова произведем сравнение моделей
##Сравнение моделей
waldtest(m3, m2, vcov = vcovHC(m3, type = "HC0"))
## Wald test
##
## Model 1: alcohol ~ type + constant_acidity + acetic_acidity + critic_acid_amount +
## residual_sugar + free_sulfur_dioxide + density + pH + sulphates +
## below_grade + I(residual_sugar^2) + I(chlorides^2) + density *
## above_grade + pH * above_grade + chlorides * pH + type *
## total_sulfur_dioxide + total_sulfur_dioxide * density
## Model 2: alcohol ~ type + constant_acidity + acetic_acidity + critic_acid_amount +
## residual_sugar + chlorides + free_sulfur_dioxide + total_sulfur_dioxide +
## density + pH + sulphates + below_grade + above_grade + I(residual_sugar^2) +
## I(chlorides^2)
## Res.Df Df F Pr(>F)
## 1 4448
## 2 4453 -5 30.167 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Получили значимо лучшую модель.
Вот графики остатков:
#Графики ошибок
plot(1:dim(wine_data)[1], rstandard(m3), xlab="i", ylab="Standardized residuals", col=mycol, pch=19)
addtrend(1:dim(wine_data)[1], rstandard(m3))
grid()
plot(fitted(m3), rstandard(m3), xlab="Fitted values", ylab="Standardized residuals", col=mycol, pch=19)
addtrend(fitted(m3), rstandard(m3))
grid()
for (col_name in colnames(wine_data)) {
if (col_name != "alcohol" && col_name != "above_grade" && col_name != "chlorides" && col_name != "total_sulfur_dioxide") {
plot(wine_data[,col_name], rstandard(m3), xlab=col_name, ylab="Standardized residuals", col=mycol, pch=19)
addtrend(wine_data[,col_name], rstandard(m3))
grid()
}
}
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.0022e-28
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
Считаем, что m3 – самая хорошая модель, которая у нас получилась. Чтобы сделать из нее человеческие выводы, посмотрим на ее коэффициенты
m3$coefficients
## (Intercept) type
## 1.405789e+00 1.675644e-03
## constant_acidity acetic_acidity
## 7.349085e-04 5.447313e-04
## critic_acid_amount residual_sugar
## 4.079470e-04 3.129975e-04
## free_sulfur_dioxide density
## -4.102645e-06 -8.818967e-01
## pH sulphates
## 3.225892e-03 1.366362e-03
## below_grade I(residual_sugar^2)
## -1.898796e-04 -5.226072e-07
## I(chlorides^2) above_grade
## -2.114887e-03 -1.838907e-02
## chlorides total_sulfur_dioxide
## -2.329061e-02 3.056929e-04
## density:above_grade pH:above_grade
## 1.943490e-02 -2.915329e-04
## pH:chlorides type:total_sulfur_dioxide
## 7.336650e-03 -9.881718e-07
## density:total_sulfur_dioxide
## -3.070120e-04
Видим, что наиболее весомый признак – плотность, причем с отрицательнным весом. Это логично, потому что плотность спирта меньше плотности воды, и поэтому этот признак довольно четко связан с алкогольностью. Второй по значимости признак – оценка, из которого мы видим, что чем менее алкогольное вино, тем вероятнее оно получит более высокую оценку.